A Table-Based Q-Learning Reinforcement Agent in a Grid World

This is a simple example of a tabular Q-learning agent. The Q function is stored as a table, and each decision is made thermally: an action is sampled from a Boltzmann (softmax) distribution over the Q-values of the current state.
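
As a quick illustration of this thermal (Boltzmann) action selection, the sketch below picks an action with probability proportional to exp(Q[s, a] / T), where T is the temperature. The helper name boltzmann_action and its temperature argument are illustrative only; the training loop further down implements the same idea inline.

import numpy as np

def boltzmann_action(q_values, temperature=0.01):
    # Exponentiate the Q-values scaled by the temperature; a lower
    # temperature pushes the choice toward the greedy argmax.
    prefs = np.exp(np.asarray(q_values) / temperature)
    probs = prefs / prefs.sum()
    # sample an action index according to the softmax probabilities
    return np.random.choice(len(probs), p=probs)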


In [1]:
import numpy as np
import random
import gym

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt

from IPython.display import clear_output
from tqdm import tqdm

In [2]:
env = gym.make('FrozenLake-v0')


[2017-03-24 09:58:26,166] Making new env: FrozenLake-v0

In [40]:
Q = np.zeros([env.observation_space.n, env.action_space.n])

In [60]:
# Set learning parameters
decision_temperature = 0.01  # softmax (Boltzmann) temperature
l_rate = 0.5                 # learning rate
y = .99                      # discount factor
e = 0.1                      # exploration rate; not used by the softmax action selection below
num_episodes = 900

# create lists to contain total rewards and steps per episode

epi_length = []
rs = []

for i in tqdm(range(num_episodes)):
    s = env.reset()
    r_total = 0
    done = False
    number_jumps = 0

    # limit the number of jumps per episode
    while number_jumps < 99:
        number_jumps += 1

        # Boltzmann (softmax) preferences over the actions in the current state
        softmax = np.exp(Q[s] / decision_temperature)
        rand_n = np.random.rand() * np.sum(softmax)

        # sample the next action in proportion to its softmax preference
        acc = 0
        for ind in range(env.action_space.n):
            acc += softmax[ind]
            if acc >= rand_n:
                a = ind
                break
                
        #print(a, softmax, rand_n)
        
        # a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1./(i+1)))

        s_next, r, done, _ = env.step(a)

        # largest Q-value attainable from the next state
        max_Q_next = np.max(Q[s_next, :])

        # Q-learning update: move Q[s, a] toward the target r + y * max_Q_next
        Q[s, a] += l_rate * (r + y * max_Q_next - Q[s, a])

        r_total += r
        s = s_next
        if done:
            # anneal the exploration rate as learning progresses
            # (e is not used by the softmax action selection above)
            e = 1./((i/50) + 10)
            break

    # report progress every 900 episodes (with num_episodes = 900, only once at the end)
    if i % 900 == 899:

        clear_output(wait=True)
        # the sum of the last 200 (0/1) rewards divided by 2 is a success percentage
        print("success rate: " + str(sum(rs[-200:])/2) + "%")

        plt.figure(figsize=(8, 8))
        plt.subplot(211)
        plt.title("Jumps Per Episode", fontsize=18)
        plt.plot(epi_length[-200:], "#23aaff")
        plt.subplot(212)
        plt.title('Reward For Each Episode (0/1)', fontsize=18)
        plt.plot(rs[-200:], "o", color='#23aaff', alpha=0.1)
        
        plt.figure(figsize=(6, 6))
        plt.title('Decision Table', fontsize=18)
        plt.xlabel("States", fontsize=15)
        plt.ylabel('Actions', fontsize=15)
        plt.imshow(Q.T)
        plt.show()

    epi_length.append(number_jumps)
    rs.append(r_total)


success rate: 71.5%
100%|██████████| 900/900 [00:01<00:00, 546.26it/s]

In [52]:
def mv_avg(xs, n):
    # simple moving average with window size n
    return [sum(xs[i:i+n])/n for i in range(len(xs)-n)]
# plt.plot(mv_avg(rs, 200))

In [55]:
plt.figure(figsize=(8, 8))
plt.subplot(211)
plt.title("Jumps Per Episode", fontsize=18)
plt.plot(epi_length, "#23aaff", linewidth=0.1, alpha=0.7,
        label="raw data")
plt.plot(mv_avg(epi_length, 200), color="blue", alpha=0.3, linewidth=4, 
         label="Moving Average")
plt.legend(loc=(1.05, 0), frameon=False, fontsize=15)
plt.subplot(212)
plt.title('Reward For Each Episode (0/1)', fontsize=18)
#plt.plot(rs, "o", color='#23aaff', alpha=0.2, markersize=0.4, label="Reward")
plt.plot(mv_avg(rs, 200), color="red", alpha=0.5, linewidth=4, label="Moving Average")
plt.ylim(-0.1, 1.1)
plt.legend(loc=(1.05, 0), frameon=False, fontsize=15)
plt.savefig('./figures/Frozen-Lake-v0-thermal-table.png', dpi=300, bbox_inches='tight')


